library(ggpubr)
Loading required package: magrittr

Attaching package: ‘magrittr’

The following object is masked from ‘package:purrr’:

    set_names

The following object is masked from ‘package:tidyr’:

    extract


Attaching package: ‘ggpubr’

The following object is masked from ‘package:cowplot’:

    get_legend
f74.rna.seu@meta.data <- f74.rna.seu@meta.data %>%
  rownames_to_column("cell") %>%
  left_join(annotation.df) %>%
  column_to_rownames("cell")
Joining, by = "cell"

First I have made the snap file from the cellranger fragments.tsv file. Script is multiOmic_benchmark/preprocess/fragments2snap.sh.

Following integration vignette

Barcode selection

Filtering based on number of reads per cell and ratio of fragments that are within promoters

barcode.ls = lapply(seq(snap.files), function(i){
  barcodes = barcode.ls[[i]];
  idx = which(
      barcodes$logUMI >= cutoff.logUMI.low[i] & 
      barcodes$logUMI <= cutoff.logUMI.high[i] & 
      barcodes$promoter_ratio >= cutoff.FRIP.low[i] &
      barcodes$promoter_ratio <= cutoff.FRIP.high[i]
  );
  barcodes[idx,]
});
x.sp.ls = lapply(seq(snap.files), function(i){
  barcodes = barcode.ls[[i]];
  x.sp = x.sp.ls[[i]];
  barcode.shared = intersect(x.sp@barcode, barcodes$barcode);
  x.sp = x.sp[match(barcode.shared, x.sp@barcode),];
  barcodes = barcodes[match(barcode.shared, barcodes$barcode),];
  x.sp@metaData = barcodes;
  x.sp
})
names(x.sp.ls) = sample.names;
x.sp = Reduce(snapRbind, x.sp.ls);
x.sp@metaData["sample"] = x.sp@sample;
x.sp
number of barcodes: 6043
number of bins: 0
number of genes: 0
number of peaks: 0
number of motifs: 0

Add cell-by-bin matrix

x.sp = addBmatToSnap(x.sp, bin.size = 5000)
Epoch: reading cell-bin count matrix session ...

Binarize matrix

Some items in the count matrix have abnormally high coverage perhaps due to the alignment errors. Therefore, we next remove top 0.1% items in the count matrix and then convert the remaining non-zero values to 1.

x.sp = makeBinary(x.sp, mat="bmat")
x.sp
number of barcodes: 6043
number of bins: 651791
number of genes: 0
number of peaks: 0
number of motifs: 0

Filter bins

Filter out bins overlapping w ENCODE blacklist

black_list = read.table("~/annotations/hg38.blacklist.bed.gz")
black_list.gr = GRanges(
  black_list[,1], 
  IRanges(black_list[,2], black_list[,3])
);
idy = queryHits(
  findOverlaps(x.sp@feature, black_list.gr)
);
if(length(idy) > 0){
  x.sp = x.sp[,-idy, mat="bmat"];
};
x.sp
number of barcodes: 6043
number of bins: 651765
number of genes: 0
number of peaks: 0
number of motifs: 0

Exclude bad chromosomes

chr.exclude = seqlevels(x.sp@feature)[grep("random|chrM", seqlevels(x.sp@feature))]
idy = grep(paste(chr.exclude, collapse="|"), x.sp@feature)
if(length(idy) > 0){
  x.sp = x.sp[,-idy, mat="bmat"]
}
x.sp
number of barcodes: 6043
number of bins: 650342
number of genes: 0
number of peaks: 0
number of motifs: 0

remove the top 5% bins that overlap with invariant features such as the house keeping gene promoters

bin.cov = log10(Matrix::colSums(x.sp@bmat)+1)
# bin.cov = Matrix::colSums(x.sp@bmat)
hist(
  bin.cov[bin.cov > 0], 
  xlab="log10(bin cov)", 
  main="log10(Bin Cov)", 
  col="lightblue", 
  # xlim=c(0, 5),
  breaks=100
);

bin.cutoff = quantile(bin.cov[bin.cov > 0], 0.95)
idy = which(bin.cov <= bin.cutoff & bin.cov > 0)
x.sp = x.sp[, idy, mat="bmat"];
x.sp
number of barcodes: 6043
number of bins: 539584
number of genes: 0
number of peaks: 0
number of motifs: 0

Remove any cells of bin coverage less than 1,000. The rational behind this is that some cells may have high number of unique fragments but end up with low bin coverage after filtering. This step is optional but highly recommanded.

idx = which(Matrix::rowSums(x.sp@bmat) > 1000);
x.sp = x.sp[idx,];
x.sp
number of barcodes: 5793
number of bins: 539584
number of genes: 0
number of peaks: 0
number of motifs: 0

Dimensionality reduction

Uses diffusion map algorithm w sampling technique to make it fast.

## Sample 100 cells as landmarks 
row.covs.dens <- density(
  x = x.sp@metaData[,"logUMI"], 
  bw = 'nrd', adjust = 1
)
sampling_prob <- 1 / (approx(x = row.covs.dens$x, y = row.covs.dens$y, xout = x.sp@metaData[,"logUMI"])$y + .Machine$double.eps);
set.seed(1)
idx.landmark.ds <- sort(sample(x = seq(nrow(x.sp)), size = 1000, prob = sampling_prob))
## Split between landmark and query cells
x.landmark.sp = x.sp[idx.landmark.ds,];
x.query.sp = x.sp[-idx.landmark.ds,];
## Run diffusion map on landmark
x.landmark.sp = runDiffusionMaps(
  obj= x.landmark.sp,
  input.mat="bmat", 
  num.eigs=50
);
Epoch: checking the inputs ...
Epoch: computing jaccard similarity matrix ...
Epoch: fitting regression model ...
Epoch: performing normalization ...
Epoch: computing eigen decomposition ...
Epoch: Done
x.landmark.sp@metaData$landmark = 1;
## Project query cells
x.query.sp = runDiffusionMapsExtension(
  obj1=x.landmark.sp, 
  obj2=x.query.sp,
  input.mat="bmat"
)
Epoch: checking the inputs ...
Epoch: computing jaccard similarity matrix ...
Epoch: performing normalization ...
Epoch: projecting query cells to the reference ...
Epoch: Done
x.query.sp@metaData$landmark = 0;
## Combine 
x.sp = snapRbind(x.landmark.sp, x.query.sp);
'rBind' is deprecated.
 Since R version 3.2.0, base's rbind() should work fine with S4 objects
x.sp = x.sp[order(x.sp@metaData[,"sample"])]; #IMPORTANT

To determine significant diffusion components: > We use an ad hoc method by simply looking at a pairwise plot and select the number of eigen vectors that the scatter plot starts looking like a blob. In the below example, we choose the first 15 eigen vectors.

plotDimReductPW(
    obj=x.sp, 
    eigs.dims=1:50,
    point.size=0.3,
    point.color="grey",
    point.shape=19,
    point.alpha=0.6,
    down.sample=5000,
    pdf.file.name=NULL, 
    pdf.height=7, 
    pdf.width=7
  );

Clustering and visualization

x.sp=runCluster(
    obj=x.sp,
    tmp.folder=tempdir(),
    louvain.lib="leiden",
    seed.use=10,
    resolution=0.7
  );
Epoch: checking input parameters
Epoch: finding clusters using leiden

Visualization

Make gene matrix

saveRDS("~/my_data/cellranger-atac110_count_30439_WSSS8038360_GRCh38-1_1_0.snapATAC.RDS")
Error in saveRDS("~/my_data/cellranger-atac110_count_30439_WSSS8038360_GRCh38-1_1_0.snapATAC.RDS") : 
  'file' must be non-empty string

Repeat visualization on gmat

FeaturePlot(f74.seu, features = c('PTPRC','CD4','CD8A','CD8B','CD79A','FOXN1','EPCAM','PDGFRA','GNG4', 'FOXP3','RAG1','RAG2','NKG7','CCR7'), reduction = "umap.snap")
All cells have the same value (0) of CD79A.All cells have the same value (0) of RAG2.All cells have the same value (0) of NKG7.All cells have the same value (0) of CCR7.
DimHeatmap(f74.seu, dims = 1:6, balanced = TRUE,ncol = 1)

31 Oct 2019 09:28:40 [rsession-] CLIENT EXCEPTION (rsession-jovyan): (TypeError) : Cannot read property 'a' of null;
org/rstudio/studio/client/workbench/views/source/editors/text/ChunkPlotPage.java#52::execute
org/rstudio/studio/client/workbench/views/source/editors/text/ChunkPlotWidget.java#49::onBrowserEvent
com/google/gwt/user/client/DOM.java#1414::dispatchEvent
com/google/gwt/user/client/impl/DOMImplStandard.java#312::dispatchEvent
com/google/gwt/user/client/impl/DOMImplStandard.java#334::dispatchUnhandledEvent
com/google/gwt/core/client/impl/Impl.java#244::apply
com/google/gwt/core/client/impl/Impl.java#283::entry0
rstudio-0.js#-1::eval
Client-ID: 7beb3877-891d-4e16-98d5-f8cca1f72d5d
User-Agent: Mozilla/5.0 (Macintosh  Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36

Differential analysis

Expression of markers in RNA data

map(1:10, ~ plot_cluster_marker_ATACvsRNA(.x))
All cells have the same value (0) of MIR4300HG.All cells have the same value (0) of NOS1.All cells have the same value (0) of RPH3A.Cannot convert object of class listggarrange into a grob.All cells have the same value (0) of C5orf66-AS2.All cells have the same value (0) of CYP3A43.
[[1]]

[[2]]

[[3]]

[[4]]

[[5]]

[[6]]

[[7]]

[[8]]

[[9]]

[[10]]

Embedding on most variable genes in ATAC (not in gene expression)

Thoughts

---
title: "Snap ATAC data processing"
output: html_notebook
---

```{r}
suppressPackageStartupMessages({
  library(SnapATAC)
  library(leiden)
  library(umap)
  library(GenomicRanges)
  library(zoo)
  library(tidyverse)
  library(cowplot)
  library(ggpubr)
})
source("~/multiOmic_benchmark/preprocess/selectFeatures.R")
```
```{r}
## Load and embed RNA data
f74.sce.list <- readRDS("~/my_data/integrated_thymus/F74_SCElist_20191017.RDS")
f74.rna <- f74.sce.list$RNA

f74.rna.seu <- as.Seurat(f74.rna, verbose=FALSE)
integrate_features <- HVG_Seurat(f74.rna, nfeatures = 3000)
VariableFeatures(f74.rna.seu) <- integrate_features
f74.rna.seu <- ScaleData(f74.rna.seu, do.center = TRUE, verbose=FALSE)
f74.rna.seu <- RunPCA(f74.rna.seu) 
f74.rna.seu <- RunUMAP(f74.rna.seu, reduction = "pca", dims=1:50) 

## Add cell type annotation
annotation.df <- read.csv("~/my_data/F74_RNA_obs.csv")
annotation.df <- annotation.df %>%
  mutate(cell=str_remove(as.character(X), "F74_1_") %>% str_c(ifelse(batch==0,'_1', "_2"))) 

f74.rna.seu@meta.data <- f74.rna.seu@meta.data %>%
  rownames_to_column("cell") %>%
  left_join(annotation.df) %>%
  column_to_rownames("cell")

```


First I have made the snap file from the cellranger `fragments.tsv` file. Script is `multiOmic_benchmark/preprocess/fragments2snap.sh`.

Following [integration vignette](https://github.com/r3fang/SnapATAC/blob/master/examples/10X_PBMC_15K/README.md) 

### Barcode selection
Filtering based on number of reads per cell and ratio of fragments that are within promoters
```{r}
snap.files <- "~/my_data/cellranger-atac110_count_30439_WSSS8038360_GRCh38-1_1_0.snap"
sample.names <- "F74"
barcode.files <- '~/my_data/singlecell.csv'

x.sp.ls <- list(createSnap(snap.files, sample.names))
names(x.sp.ls) = sample.names
x.sp.ls

barcode.ls = lapply(seq(snap.files), function(i){
    barcodes = read.csv(
        barcode.files[i], 
        head=TRUE
    );
    # remove NO BAROCDE line
    barcodes = barcodes[2:nrow(barcodes),];
    barcodes$logUMI = log10(barcodes$passed_filters + 1);
    barcodes$promoter_ratio = (barcodes$promoter_region_fragments+1) / (barcodes$passed_filters + 1);
    barcodes
  })

plots = lapply(seq(snap.files), function(i){
    p1 = ggplot(
        barcode.ls[[i]], 
        aes(x=logUMI, y=promoter_ratio)) + 
        geom_point(size=0.3, col="grey") +
        theme_classic()	+
        ggtitle(sample.names[[i]]) +
        ylim(0, 1) + xlim(0, 6) + 
        labs(x = "log10(counts)", y="promoter ratio")
        p1
    })

## Select and viz cutoffs
cutoff.logUMI.low = 3.3
cutoff.logUMI.high = 4.8
cutoff.FRIP.low = 0.25
cutoff.FRIP.high = 0.65


plots[[1]] + 
  geom_vline(xintercept = c(cutoff.logUMI.low[1],cutoff.logUMI.high[1]), linetype=2) +
  geom_hline(yintercept = c(cutoff.FRIP.low[1],cutoff.FRIP.high[1]), linetype=2)
```
```{r}
barcode.ls = lapply(seq(snap.files), function(i){
  barcodes = barcode.ls[[i]];
  idx = which(
      barcodes$logUMI >= cutoff.logUMI.low[i] & 
      barcodes$logUMI <= cutoff.logUMI.high[i] & 
      barcodes$promoter_ratio >= cutoff.FRIP.low[i] &
      barcodes$promoter_ratio <= cutoff.FRIP.high[i]
  );
  barcodes[idx,]
});
x.sp.ls = lapply(seq(snap.files), function(i){
  barcodes = barcode.ls[[i]];
  x.sp = x.sp.ls[[i]];
  barcode.shared = intersect(x.sp@barcode, barcodes$barcode);
  x.sp = x.sp[match(barcode.shared, x.sp@barcode),];
  barcodes = barcodes[match(barcode.shared, barcodes$barcode),];
  x.sp@metaData = barcodes;
  x.sp
})
names(x.sp.ls) = sample.names;
x.sp = Reduce(snapRbind, x.sp.ls);
x.sp@metaData["sample"] = x.sp@sample;
x.sp
```

### Add cell-by-bin matrix
```{r}
x.sp = addBmatToSnap(x.sp, bin.size = 5000)
```
## Binarize matrix
Some items in the count matrix have abnormally high coverage perhaps due to the alignment errors. Therefore, we next remove top 0.1% items in the count matrix and then convert the remaining non-zero values to 1.
```{r}
x.sp = makeBinary(x.sp, mat="bmat")
x.sp
```

## Filter bins
Filter out bins overlapping w ENCODE blacklist

```{r}
black_list = read.table("~/annotations/hg38.blacklist.bed.gz")
black_list.gr = GRanges(
  black_list[,1], 
  IRanges(black_list[,2], black_list[,3])
);
idy = queryHits(
  findOverlaps(x.sp@feature, black_list.gr)
);
if(length(idy) > 0){
  x.sp = x.sp[,-idy, mat="bmat"];
};
x.sp
```

Exclude bad chromosomes
```{r}
chr.exclude = seqlevels(x.sp@feature)[grep("random|chrM", seqlevels(x.sp@feature))]
idy = grep(paste(chr.exclude, collapse="|"), x.sp@feature)
if(length(idy) > 0){
  x.sp = x.sp[,-idy, mat="bmat"]
}
x.sp
```

remove the top 5% bins that overlap with invariant features such as the house keeping gene promoters

```{r}
bin.cov = log10(Matrix::colSums(x.sp@bmat)+1)
# bin.cov = Matrix::colSums(x.sp@bmat)
hist(
  bin.cov[bin.cov > 0], 
  xlab="log10(bin cov)", 
  main="log10(Bin Cov)", 
  col="lightblue", 
  # xlim=c(0, 5),
  breaks=100
);
bin.cutoff = quantile(bin.cov[bin.cov > 0], 0.95)
idy = which(bin.cov <= bin.cutoff & bin.cov > 0)
x.sp = x.sp[, idy, mat="bmat"];
x.sp
```

Remove any cells of bin coverage less than 1,000. The rational behind this is that some cells may have high number of unique fragments but end up with low bin coverage after filtering. This step is optional but highly recommanded.
```{r}
idx = which(Matrix::rowSums(x.sp@bmat) > 1000);
x.sp = x.sp[idx,];
x.sp
```

## Dimensionality reduction
Uses diffusion map algorithm w sampling technique to make it fast.

```{r}
## Sample 100 cells as landmarks 
row.covs.dens <- density(
  x = x.sp@metaData[,"logUMI"], 
  bw = 'nrd', adjust = 1
)
sampling_prob <- 1 / (approx(x = row.covs.dens$x, y = row.covs.dens$y, xout = x.sp@metaData[,"logUMI"])$y + .Machine$double.eps);
set.seed(1)
idx.landmark.ds <- sort(sample(x = seq(nrow(x.sp)), size = 1000, prob = sampling_prob))

## Split between landmark and query cells
x.landmark.sp = x.sp[idx.landmark.ds,];
x.query.sp = x.sp[-idx.landmark.ds,];

## Run diffusion map on landmark
x.landmark.sp = runDiffusionMaps(
  obj= x.landmark.sp,
  input.mat="bmat", 
  num.eigs=50
);
x.landmark.sp@metaData$landmark = 1;

## Project query cells
x.query.sp = runDiffusionMapsExtension(
  obj1=x.landmark.sp, 
  obj2=x.query.sp,
  input.mat="bmat"
)
x.query.sp@metaData$landmark = 0;

## Combine 
x.sp = snapRbind(x.landmark.sp, x.query.sp);
x.sp = x.sp[order(x.sp@metaData[,"sample"])]; #IMPORTANT
```

To determine significant diffusion components:
> We use an ad hoc method by simply looking at a pairwise plot and select the number of eigen vectors that the scatter plot starts looking like a blob. In the below example, we choose the first 15 eigen vectors.

```{r}
plotDimReductPW(
    obj=x.sp, 
    eigs.dims=1:50,
    point.size=0.3,
    point.color="grey",
    point.shape=19,
    point.alpha=0.6,
    down.sample=5000,
    pdf.file.name=NULL, 
    pdf.height=7, 
    pdf.width=7
  );
```

## Clustering and visualization
```{r}
signif.dims = 1:16
x.sp = runKNN(
    obj=x.sp,
    eigs.dims=signif.dims,
    k=15
  );

x.sp=runCluster(
    obj=x.sp,
    tmp.folder=tempdir(),
    louvain.lib="leiden",
    seed.use=10,
    resolution=0.7
  );
```

Visualization
```{r, fig.height=5, fig.width=5}
x.sp = runViz(
 obj=x.sp, 
 tmp.folder=tempdir(),
 dims=2,
 eigs.dims=signif.dims, 
 method="umap",
 seed.use=10
)

plotViz(
    obj= x.sp,
    method="umap", 
    main="Cluster",
    point.color=x.sp@cluster, 
    point.size=0.2, 
    point.shape=19, 
    text.add=TRUE,
    text.size=1,
    text.color="black",
    down.sample=10000,
    legend.add=FALSE
  );

```


<!-- ## Outlier analysis -->
<!-- ```{r} -->
<!-- data.frame(x.sp@umap) %>% -->
<!--   mutate(cell=x.sp@barcode) %>% -->
<!--   # arrange(umap.1) -->
<!--   ggplot(aes(umap.1, umap.2)) + -->
<!--   geom_point(size=0.2) -->
<!-- ``` -->
<!-- ```{r, fig.height=5, fig.width=15} -->
<!-- outlier.ix.1 <- data.frame(x.sp@umap) %>% -->
<!--   mutate(cell=x.sp@barcode) %>% -->
<!--   rowid_to_column() %>% -->
<!--   dplyr::filter(umap.1 > 100) %>% -->
<!--   pull(rowid) -->
<!-- outlier.ix.2 <- data.frame(x.sp@umap) %>% -->
<!--   mutate(cell=x.sp@barcode) %>% -->
<!--   rowid_to_column() %>% -->
<!--   dplyr::filter(umap.2 < -50) %>% -->
<!--   pull(rowid) -->

<!-- smp.other <- sample(setdiff(1:length(x.sp@barcode), c(outlier.ix.1, outlier.ix.2)), 100) -->

<!-- long.outlier.bmat <- -->
<!--   x.sp@bmat[c(outlier.ix.1, outlier.ix.2, smp.other),] %>% -->
<!--   as.matrix() %>% -->
<!--   melt(value.name = "acc", varnames=c("barcode", "bin")) -->

<!-- # long.outlier.bmat %>% -->
<!-- #   # dplyr::filter(barcode=='AACAAAGGTAGACGCA-1') %>% -->
<!-- #   mutate(big.bins=cut(bin,breaks = 10000)) %>% -->
<!-- #   group_by(barcode) %>% -->
<!-- #   arrange(bin) %>% -->
<!-- #   mutate(runmean=rollmean(acc,k = 10000)[1:n()]) %>% -->
<!-- #   ungroup() %>% -->
<!-- #   group_by(barcode, big.bins) %>% -->
<!-- #   summarise(bin.mean=mean(runmean)) %>% -->
<!-- #   mutate(big.bins=as.numeric(big.bins)) %>% -->
<!-- #   mutate(outlier=ifelse(barcode %in% x.sp@barcode[outlier.ix], TRUE, FALSE)) %>% -->
<!-- #   ggplot(aes( big.bins, barcode, fill=bin.mean)) + -->
<!-- #   geom_tile() + -->
<!-- #   scale_fill_viridis_c() + -->
<!-- #   facet_wrap(outlier~., scales="free_y", ncol=1, nrow=2) -->
<!-- #   ungroup() %>% -->
<!-- #   ggplot(aes(bin, runmean)) + -->
<!-- #   facet_wrap(big.bins~., scale="free_x", ncol=2) + -->
<!-- #   geom_line(aes(group=barcode, color=outlier), size=0.5, alpha=0.5) -->
<!-- #    -->
<!-- ``` -->

<!-- ```{r} -->
<!-- long.outlier.bmat %>% -->
<!--   group_by(barcode) %>% -->
<!--   summarise(acc=sum(acc)) %>% -->
<!--   mutate(outlier=case_when(barcode %in% x.sp@barcode[outlier.ix.1] ~  "out1",  -->
<!--                            barcode %in% x.sp@barcode[outlier.ix.2] ~ "out2", -->
<!--                            FALSE ~ 'other')) %>% -->
<!--   # mutate(outlier=ifelse(barcode %in% x.sp@barcode[outlier.ix.2], "out2", FALSE)) %>% -->
<!--   ggplot(aes(outlier, acc)) + -->
<!--   ggbeeswarm::geom_quasirandom() -->
<!-- ``` -->


## Make gene matrix
```{r}
transcripts.gr = rtracklayer::import("~/Homo_sapiens.GRCh38.93.filtered.gtf")
colnames(transcripts.gr@elementMetadata) <- str_replace(colnames(transcripts.gr@elementMetadata), "gene_name", "name")

genes.gr <- unlist(range(split(transcripts.gr, ~ name)))  ## From transcripts to genes
genes.gr$name <- names(genes.gr)

if (GenomeInfoDb::seqlevelsStyle(genes.gr) != GenomeInfoDb::seqlevelsStyle(x.sp@feature) ) {
  GenomeInfoDb::seqlevelsStyle(genes.gr) <- GenomeInfoDb::seqlevelsStyle(x.sp@feature)
}

x.sp = createGmatFromMat(
    obj=x.sp, 
    input.mat="bmat",
    genes=genes.gr,
    do.par=TRUE,
    num.cores=10
  )

saveRDS(x.sp, file = "~/my_data/cellranger-atac110_count_30439_WSSS8038360_GRCh38-1_1_0.snapATAC.RDS")

```

Repeat visualization on gmat
```{r}
## Make suerat object
f74.seu <- snapToSeurat(
    obj=x.sp, 
    eigs.dims=1:20, 
    norm=TRUE,
    scale=TRUE
    )

## Save UMAP based on SnapATAC processing
f74.seu <- RunUMAP(f74.seu, reduction = "SnapATAC", dims=signif.dims, reduction.name = "umap.snap")


f74.seu <- FindVariableFeatures(f74.seu)
f74.seu <- RunPCA(f74.seu, dims=1:50)
f74.seu <- RunUMAP(f74.seu, reduction = "pca", dims=1:50, verbose=FALSE) 

ggpubr::ggarrange(
  DimPlot(f74.seu, reduction = "umap", group.by = NULL) + ggtitle("gmat"),
  DimPlot(f74.seu, reduction = "umap.snap", group.by = NULL) + ggtitle("bmat - snap") 
  )

```


```{r}
plotly::ggplotly(DimPlot(f74.seu, reduction = "umap", group.by = NULL) + ggtitle("gmat"))
```

```{r, fig.height=15, fig.width=15}
FeaturePlot(f74.seu, features = c('PTPRC','CD4','CD8A','CD8B','CD79A','FOXN1','EPCAM','PDGFRA','GNG4', 'FOXP3','RAG1','RAG2','NKG7','CCR7'), reduction = "umap.snap")
```

```{r, fig.height=15, fig.width=15}
DimHeatmap(f74.seu, dims = 1:6, balanced = TRUE,ncol = 1)
```

### Differential analysis 
<!-- Snap ATAC way -->
<!-- ```{r} -->
<!-- cluster = 1 -->
<!-- DARs= findDAR( -->
<!--     obj=x.sp, -->
<!--     input.mat="gmat", -->
<!--     cluster.pos=cluster, -->
<!--     # cluster.neg = 5, -->
<!--     cluster.neg.method="random", -->
<!--     test.method="exactTest", -->
<!--     bcv=0.4, #0.4 for human, 0.1 for mouse -->
<!--     seed.use=42 -->
<!--   ); -->


<!-- gmat.cl <- x.sp@gmat[which(x.sp@cluster==cluster),] -->
<!-- gmat.neg <- x.sp@gmat[which(x.sp@cluster!=cluster),] -->

<!-- hist(as.matrix(gmat.cl), breaks=50) -->


<!-- # DARs %>% -->
<!--   # mutate(gene=colnames(x.sp@gmat)) %>% -->
<!--   # arrange() %>% -->
<!--   # filter(PValue < 0.05) -->

<!-- hist(DARs$PValue, breaks = 20) -->

<!-- DARs$FDR = p.adjust(DARs$PValue, method="BH") -->
<!-- idy = which(DARs$FDR < 5e-2 & DARs$logFC > 0) -->
<!-- plot(DARs$logCPM, DARs$logFC,  -->
<!--     pch=19, cex=0.1, col="grey",  -->
<!--     ylab="logFC", xlab="logCPM", -->
<!--     main=paste("Cluster", cluster) -->
<!--   ) -->
<!-- points(DARs$logCPM[idy],  -->
<!--     DARs$logFC[idy],  -->
<!--     pch=19,  -->
<!--     cex=0.5,  -->
<!--     col="red" -->
<!--   ) -->

<!-- ``` -->
```{r}
DAGs_cl <- FindAllMarkers(f74.seu, min.pct = 0.2, logfc.threshold = 0.3)
DAGs_cl <- 
  DAGs_cl %>%
  mutate(positive=avg_logFC > 0)
```
```{r, fig.height=14, fig.width=9}
top10_markers <- DAGs_cl %>%
  filter(positive) %>%
  group_by(cluster) %>%
  arrange(p_val_adj) %>%
  top_n(10) %>%
  pull(gene) %>%
  unique()

top10_markers <- DAGs_cl %>%
  filter(positive) %>%
  group_by(cluster) %>%
  # arrange(p_val_adj) %>%
  top_n(10,wt = - p_val_adj) %>%
  pull(gene)

DotPlot(f74.seu, features = top10_markers) +
  coord_flip()
```

Expression of markers in RNA data

```{r, fig.width=12, fig.height=16}
plot_marker_ATACvsRNA <- function(marker){
    ggpubr::ggarrange(
      FeaturePlot(f74.seu, feature=marker, reduction="umap.snap") + ggtitle("ATAC"), FeaturePlot(f74.rna.seu, feature=marker) + ggtitle("RNA")
    ) %>% annotate_figure(top=marker)
  }

plot_cluster_marker_ATACvsRNA <- function(cl){
  top10_cluster <-
    DAGs_cl %>%
    filter(positive) %>%
    filter(cluster==cl) %>%
    filter(gene %in% rownames(f74.rna.seu)) %>%
    top_n(10,wt = - p_val_adj) %>%
    pull(gene)
  ggarrange(plotlist = map(top10_cluster, ~ plot_marker_ATACvsRNA(.x)), ncol=2, nrow=5) %>% 
    annotate_figure(fig.lab=paste("Cluster", cl), fig.lab.size = 20, fig.lab.face = "bold")
  }

map(1:10, ~ plot_cluster_marker_ATACvsRNA(.x))
```

```{r}
plotly::ggplotly(DimPlot(f74.rna.seu, group.by = "annotation"))
```


## Embedding on most variable genes in ATAC (not in gene expression)
```{r}
DefaultAssay(f74.seu) <- "ACTIVITY"
f74.seu.hvgatac <- FindVariableFeatures(f74.seu)
f74.seu.hvgatac <- RunPCA(f74.seu.hvgatac, dims=1:30)
f74.seu.hvgatac <- RunUMAP(f74.seu.hvgatac, reduction = "pca", dims=1:30, verbose=FALSE) 

f74.seu.hvgatac <- AddMetaData(f74.seu.hvgatac, x.sp@cluster, col.name='SnapATAC_cluster')
DimPlot(f74.seu.hvgatac, reduction = "umap", group.by = "SnapATAC_cluster") 
```


## Thoughts

- There seems to be a correspondance for gene expression and accessibility in many "accessibility markers" (see Cluster 6) --> maybe integration would work better if I don't take only the most variable features in the RNA data. 



